Around the world, people are spending an increasing amount of time on their mobile devices for email, social networking, banking and a whole range of other activities. But typing on mobile devices can be a serious pain. SwiftKey, our corporate partner in this capstone, builds a smart keyboard that makes it easier for people to type on their mobile devices. One cornerstone of their smart keyboard is predictive text models. When someone types:
I went to the
the keyboard presents three options for what the next word might be. For example, the three words might be gym, store, restaurant. In this capstone you will work on understanding and building predictive text models like those used by SwiftKey.
The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.
# knitr/ggplot2 for reporting and plots, stringr/dplyr for wrangling,
# quanteda/readtext for text analysis, R.utils for countLines().
library(knitr)
library(stringr)
library(dplyr)
library(quanteda)
library(readtext)
library(R.utils)
library(ggplot2)
# Fix the RNG seed so the sub-sampling and train/test split are reproducible.
set.seed(3301)
downloadData <- function(workingDataPath = file.path("data")) {
  # Download the Coursera/SwiftKey corpus zip and a profanity word list into
  # 'workingDataPath', and unzip the corpus on first use. Every step is
  # guarded by an existence check, so repeated calls are cheap (idempotent).
  # Returns a named list of the four file paths used downstream:
  # blogs, twitter, news, badwords.
  rawDataFileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  downloadedZipfilePath <- file.path(workingDataPath, "Coursera-SwiftKey.zip")
  badWordsFileUrl <- "https://www.cs.cmu.edu/~biglou/resources/bad-words.txt"
  badWordsFilePath <- file.path(workingDataPath, "bad-words.txt")
  basePath <- file.path(workingDataPath, "final", "en_US")
  # Create the working directory (recursive, so nested paths also work)
  if (!dir.exists(workingDataPath)) {
    dir.create(workingDataPath, recursive = TRUE)
  }
  # Download the zipped corpus. The default method ("auto") is portable,
  # unlike hard-coding "curl"; mode = "wb" prevents corruption of the
  # binary zip on Windows.
  if (!file.exists(downloadedZipfilePath)) {
    download.file(rawDataFileUrl, destfile = downloadedZipfilePath, mode = "wb")
  }
  # Download the bad-words text file (plain text, default mode is fine)
  if (!file.exists(badWordsFilePath)) {
    download.file(badWordsFileUrl, destfile = badWordsFilePath)
  }
  # Unzip once; 'basePath' only exists after a successful extraction
  if (!dir.exists(basePath)) {
    unzip(zipfile = downloadedZipfilePath, exdir = workingDataPath)
  }
  list(blogs = file.path(basePath, "en_US.blogs.txt"),
       twitter = file.path(basePath, "en_US.twitter.txt"),
       news = file.path(basePath, "en_US.news.txt"),
       badwords = badWordsFilePath)
}
dataPath <- file.path("..", "data")
# Avoid attach(): it mutates the search path and makes it unclear where
# 'blogs' etc. come from. Bind the returned paths explicitly instead.
filePaths <- downloadData(dataPath)
blogs <- filePaths$blogs
twitter <- filePaths$twitter
news <- filePaths$news
badwords <- filePaths$badwords
c(blogs, twitter, news, badwords)
## [1] "../data/final/en_US/en_US.blogs.txt"
## [2] "../data/final/en_US/en_US.twitter.txt"
## [3] "../data/final/en_US/en_US.news.txt"
## [4] "../data/bad-words.txt"
Since the original data set is very large and slow to analyse, we sample 1% of the data.
subSample <- function(input, output, subSamplingRate = 0.01) {
  # Write a random ~'subSamplingRate' fraction of the lines of 'input' to
  # 'output', streaming one line at a time so the full file never has to be
  # held in memory. Skipped entirely when 'output' already exists
  # (idempotent). Returns the number of lines in the sampled file.
  # The rate is now a parameter (default 0.01, matching the old hard-coded
  # value) so other sampling fractions can be tried without editing the body.
  if (!file.exists(output)) {
    fileLines <- as.numeric(countLines(input))
    # One biased coin flip per input line; a 1 means "keep this line".
    flipABiasedCoin <- rbinom(fileLines, size = 1, prob = subSamplingRate)
    conRead <- file(input, "r")
    conWrite <- file(output, "w")
    len <- 0
    while (length(oneLine <- readLines(conRead, 1, warn = FALSE)) > 0) {
      len <- len + 1
      if (flipABiasedCoin[len] == 1) {
        writeLines(oneLine, conWrite)
      }
    }
    close(conRead)
    close(conWrite)
  }
  as.numeric(countLines(output))
}
# Take a 1% sub-sample of each corpus; each call returns the line count of
# the sampled file.
blogsSubSampling <- file.path(dataPath, "sub-sample.blogs.txt")
subBlogs <- subSample(blogs, blogsSubSampling)
twitterSubSampling <- file.path(dataPath, "sub-sample.twitter.txt")
subTweets <- subSample(twitter, twitterSubSampling)
newsSubSampling <- file.path(dataPath, "sub-sample.news.txt")
subNews <- subSample(news, newsSubSampling)
# Lines kept per corpus (blogs, twitter, news)
c(subBlogs, subTweets, subNews)
## [1] 23673 23662 10161
Divide the sampled data into a training set and a test set at a ratio of 7:3.
# NOTE(review): the name "devideDataset" is a misspelling of "divideDataset";
# it is kept as-is so existing callers keep working.
devideDataset <- function(input, outputTrain, outputTest, trainRate = 0.7) {
  # Randomly split the lines of 'input' into a training file ('outputTrain',
  # receiving ~'trainRate' of the lines) and a test file ('outputTest',
  # receiving the rest), streaming one line at a time. Skipped when both
  # output files already exist (idempotent). Returns the number of lines
  # written to the training file.
  # The split ratio is now a parameter (default 0.7, matching the old
  # hard-coded value).
  if (!file.exists(outputTrain) || !file.exists(outputTest)) {
    fileLines <- as.numeric(countLines(input))
    # One biased coin flip per line; a 1 routes the line to the training set.
    flipABiasedCoin <- rbinom(fileLines, size = 1, prob = trainRate)
    conRead <- file(input, "r")
    conWriteTrain <- file(outputTrain, "w")
    conWriteTest <- file(outputTest, "w")
    len <- 0
    while (length(oneLine <- readLines(conRead, 1, warn = FALSE)) > 0) {
      len <- len + 1
      if (flipABiasedCoin[len] == 1) {
        writeLines(oneLine, conWriteTrain)
      } else {
        writeLines(oneLine, conWriteTest)
      }
    }
    close(conRead)
    close(conWriteTrain)
    close(conWriteTest)
  }
  as.numeric(countLines(outputTrain))
}
# Derive train/test file names from each sub-sample file.
blogsTrain <- paste0(blogsSubSampling, ".train.txt")
blogsTest <- paste0(blogsSubSampling, ".test.txt")
twitterTrain <- paste0(twitterSubSampling, ".train.txt")
twitterTest <- paste0(twitterSubSampling, ".test.txt")
newsTrain <- paste0(newsSubSampling, ".train.txt")
newsTest <- paste0(newsSubSampling, ".test.txt")
# 70/30 split; each call returns the training-set line count.
trainBlogs <- devideDataset(blogsSubSampling, blogsTrain, blogsTest)
trainTweets <- devideDataset(twitterSubSampling, twitterTrain, twitterTest)
trainNews <- devideDataset(newsSubSampling, newsTrain, newsTest)
# Training lines per corpus (blogs, twitter, news)
c(trainBlogs, trainTweets, trainNews)
## [1] 16598 16525 7185
Load the files using the readtext package and build a corpus for each with the quanteda package.
# Build one quanteda corpus per training file. readtext() reads each file
# as a single document; corpus() wraps it for tokenization below.
blogsCorpus <- readtext(blogsTrain) %>%
corpus()
twitterCorpus <- readtext(twitterTrain) %>%
corpus()
newsCorpus <- readtext(newsTrain) %>%
corpus()
# Profanity word list, used to remove offensive tokens from the corpora.
profanity <- readLines(badwords)
| Text | Lines | Sentences | Tokens | Types |
|---|---|---|---|---|
| sub-sample.blogs.txt.train.txt | 16598 | 18034 | 256841 | 27642 |
| sub-sample.twitter.txt.train.txt | 16525 | 18175 | 256447 | 27506 |
| sub-sample.news.txt.train.txt | 7185 | 13325 | 285258 | 31187 |
# Tokenize the Twitter corpus once; all n-gram analyses below reuse this.
twitterToken <- twitterCorpus %>%
# normalize words: drop punctuation, numbers and URLs while tokenizing
tokens(remove_punct = TRUE,
remove_numbers = TRUE,
remove_url = TRUE,
include_docvars = FALSE) %>%
# removing profanity and other unwanted words
tokens_remove(profanity)
# Document-feature matrix of unigram counts
twitterDfm <- twitterToken %>%
dfm()
# 20 most frequent tokens -- stop words dominate, as expected
twitterDfm %>%
topfeatures(20)
## the to i a you and for in of is it my on that me
## 6424 5335 5019 4284 3798 3073 2681 2585 2566 2458 2152 2117 1970 1603 1478
## at be with your have
## 1279 1249 1207 1197 1190
# Word cloud of the frequent unigrams
twitterDfm %>%
textplot_wordcloud(min_count = 6,
random_order = FALSE,
rotation = .25,
color = RColorBrewer::brewer.pal(8, "Dark2"))
# Same dfm with English stop words removed, to surface content words.
twitterDfmNoStopWord <- twitterToken %>%
# removing stop words
tokens_remove(stopwords('english')) %>%
dfm()
# 20 most frequent content words
twitterDfmNoStopWord %>%
topfeatures(20)
## just like get good love day can thanks now rt
## 1049 849 809 734 709 648 633 610 610 595
## one know time u new great today go lol see
## 579 557 543 537 533 519 518 476 463 460
# Word cloud of the frequent content words
twitterDfmNoStopWord %>%
textplot_wordcloud(min_count = 6,
random_order = FALSE,
max_words = 100,
rotation = .25,
color = RColorBrewer::brewer.pal(8, "Dark2"))
# Top-80 content words, plotted in decreasing frequency order
featuresTwitter <- twitterDfmNoStopWord %>%
textstat_frequency(n = 80)
# Sort by reverse frequency order
featuresTwitter$feature <- featuresTwitter %>%
with(reorder(feature, -frequency))
featuresTwitter %>%
ggplot(aes(x = feature, y = frequency)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Bigram dfm built from the cleaned tokens (stop words kept: they matter
# for next-word prediction).
twitterDfm2Gram <- twitterToken %>%
tokens_ngrams(n = 2) %>%
dfm()
# 20 most frequent bigrams
twitterDfm2Gram %>%
topfeatures(20)
## in_the for_the of_the on_the to_be thanks_for
## 531 495 430 343 298 288
## to_the at_the if_you thank_you i_love i_am
## 281 262 249 243 241 227
## i_have have_a for_a going_to i_was will_be
## 223 220 215 201 187 183
## to_see want_to
## 181 181
# Word cloud of the frequent bigrams
twitterDfm2Gram %>%
textplot_wordcloud(min_count = 6,
random_order = FALSE,
max_words = 100,
rotation = .25,
color = RColorBrewer::brewer.pal(8, "Dark2"))
# Top-80 bigrams, plotted in decreasing frequency order
featuresTwitter2Gram <- twitterDfm2Gram %>%
textstat_frequency(n = 80)
# Sort by reverse frequency order
featuresTwitter2Gram$feature <- featuresTwitter2Gram %>%
with(reorder(feature, -frequency))
featuresTwitter2Gram %>%
ggplot(aes(x = feature, y = frequency)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Trigram dfm built from the cleaned tokens.
twitterDfm3Gram <- twitterToken %>%
tokens_ngrams(n = 3) %>%
dfm()
# 20 most frequent trigrams
twitterDfm3Gram %>%
topfeatures(20)
## thanks_for_the thank_you_for i_love_you
## 151 62 60
## looking_forward_to can't_wait_to i_want_to
## 59 54 53
## one_of_the for_the_follow going_to_be
## 51 49 45
## a_lot_of have_a_great i_need_to
## 45 42 40
## to_see_you i_don't_know i_have_to
## 38 37 37
## is_going_to to_be_a let_me_know
## 34 34 34
## you_want_to you_have_a
## 34 33
# Word cloud of the frequent trigrams (min_count left at default here)
twitterDfm3Gram %>%
textplot_wordcloud(#min_count = 4,
random_order = FALSE,
max_words = 50,
rotation = .25,
color = RColorBrewer::brewer.pal(8, "Dark2"))
# Top-60 trigrams, plotted in decreasing frequency order
featuresTwitter3Gram <- twitterDfm3Gram %>%
textstat_frequency(60)
# Sort by reverse frequency order
featuresTwitter3Gram$feature <- featuresTwitter3Gram %>%
with(reorder(feature, -frequency))
featuresTwitter3Gram %>%
ggplot(aes(x = feature, y = frequency)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Full unigram frequency table: the median frequency of 1 shows that most
# word types occur only once (a long-tailed distribution).
featuresTwitterFull <- twitterDfmNoStopWord %>% textstat_frequency()
summary(featuresTwitterFull)
## feature frequency rank docfreq
## Length:20671 Min. : 1.000 Min. : 1 Min. :1
## Class :character 1st Qu.: 1.000 1st Qu.: 5168 1st Qu.:1
## Mode :character Median : 1.000 Median :10336 Median :1
## Mean : 5.565 Mean :10336 Mean :1
## 3rd Qu.: 3.000 3rd Qu.:15504 3rd Qu.:1
## Max. :1049.000 Max. :20671 Max. :1
## group
## Length:20671
## Class :character
## Mode :character
##
##
##
# Frequency quantiles: 90% of unigram types occur 8 times or fewer.
quantile(featuresTwitterFull$frequency, c(0, .1, .5, .9, 1))
## 0% 10% 50% 90% 100%
## 1 1 1 8 1049
# Full bigram frequency table: even sparser than the unigrams.
featuresTwitter2GramFull <- twitterDfm2Gram %>% textstat_frequency()
summary(featuresTwitter2GramFull)
## feature frequency rank docfreq
## Length:119334 Min. : 1.000 Min. : 1 Min. :1
## Class :character 1st Qu.: 1.000 1st Qu.: 29834 1st Qu.:1
## Mode :character Median : 1.000 Median : 59668 Median :1
## Mean : 1.706 Mean : 59668 Mean :1
## 3rd Qu.: 1.000 3rd Qu.: 89501 3rd Qu.:1
## Max. :531.000 Max. :119334 Max. :1
## group
## Length:119334
## Class :character
## Mode :character
##
##
##
# 90% of bigram types occur at most twice.
quantile(featuresTwitter2GramFull$frequency, c(0, .1, .5, .9, 1))
## 0% 10% 50% 90% 100%
## 1 1 1 2 531
# Full trigram frequency table: at least 90% of trigram types occur
# exactly once, which is why smoothing (below) is necessary.
featuresTwitter3GramFull <- twitterDfm3Gram %>% textstat_frequency()
summary(featuresTwitter3GramFull)
## feature frequency rank docfreq
## Length:184636 Min. : 1.000 Min. : 1 Min. :1
## Class :character 1st Qu.: 1.000 1st Qu.: 46160 1st Qu.:1
## Mode :character Median : 1.000 Median : 92318 Median :1
## Mean : 1.103 Mean : 92318 Mean :1
## 3rd Qu.: 1.000 3rd Qu.:138477 3rd Qu.:1
## Max. :151.000 Max. :184636 Max. :1
## group
## Length:184636
## Class :character
## Mode :character
##
##
##
quantile(featuresTwitter3GramFull$frequency, c(0, .1, .5, .9, 1))
## 0% 10% 50% 90% 100%
## 1 1 1 1 151
Seems to be in accordance with the Zipf’s law.
Proof-of-concept implementation: a back-off next-word prediction model with Simple Good-Turing smoothing.
simpleGoodTuring <- function(r, Nr, sd = 1.65) {
# Simple Good-Turing frequency smoothing (after Gale & Sampson, 1995).
#   r  : vector of observed term frequencies (assumed sorted ascending)
#   Nr : frequency of frequencies -- Nr[i] is how many terms occur r[i] times
#   sd : confidence factor controlling the switch from the Turing estimate
#        to the linear Good-Turing estimate (1.65 ~ one-sided 95% level)
# Returns list(p = smoothed probabilities, r = smoothed counts); both carry
# a leading element named "0" holding the mass reserved for unseen events.
# number of words
N <- sum(r * Nr)
d <- diff(r)
## Turing estimate
# turing estimate index: positions where the next observed frequency is
# exactly r+1, the only places the raw Turing formula is defined
ti <- which(d == 1)
# discount coefficients of Turing estimate: r*/r with r* = (r+1) N_{r+1}/N_r
dct <- numeric(length(r))
dct[ti] <- (r[ti] + 1) / r[ti] * c(Nr[-1], 0)[ti] / Nr[ti]
## Linear Good-Turing estimate
# Zr spreads Nr over the gap to neighbouring observed frequencies so the
# log-log regression below is not biased by gaps in the frequency spectrum
Zr <- Nr / c(1, 0.5 * (d[-1] + d[-length(d)]), d[length(d)])
# fit log(Zr) = a + b*log(r); the slope b drives the LGT discount
f <- lsfit(log(r), log(Zr))
coef <- f$coef
# corrected term frequency
rc <- r * (1 + 1 / r)^(1 + coef[2])
# discount coefficients of Linear Good-Turing estimate
dclgt <- rc / r
## make switch from Turing to LGT estimates
# approximate standard deviation of the Turing estimate at each r
# NOTE(review): Gale & Sampson use the frequency r[ti] here, whereas this
# uses the index seq_len(length(r))[ti]; the two coincide only while r runs
# 1, 2, 3, ... without gaps -- verify on data with a gappy spectrum.
rsd <- rep(1,length(r))
rsd[ti] <- (seq_len(length(r))[ti] + 1) / Nr[ti] * sqrt(Nr[ti + 1] * (1 + Nr[ti + 1] / Nr[ti]))
# use the Turing estimate until it stops differing significantly from the
# LGT estimate, then use LGT for every larger frequency
dc <- dct
for (i in 1:length(r)) {
if (abs(dct[i] - dclgt[i]) * r[i] / rsd[i] <= sd) {
dc[i:length(dc)] <- dclgt[i:length(dc)]
break
}
}
## renormalize the probabilities for observed objects
# summation of probabilities
sump <- sum(dc * r * Nr) / N
# renormalized discount coefficients; Nr[1]/N is the probability mass
# reserved for unseen events
dcr <- (1 - Nr[1] / N) * dc / sump
# term frequency
tf <- c(Nr[1] / N, r * dcr)
p <- c(Nr[1] / N, r * dcr / N)
# name the entries by their raw frequency (with "0" for unseen)
names(p) <- names(tf) <- c(0, r)
list(p = p, r = tf)
}
sgtFactory <- function() {
  # Build Simple Good-Turing discount closures for the unigram, bigram and
  # trigram dfms (twitterDfm, twitterDfm2Gram, twitterDfm3Gram are read
  # from the enclosing environment). Returns the closures via c(), whose
  # names ("dUnigram", ...) the prediction model relies on after flattening.
  #
  # Nr table: for each observed frequency r, how many features occur r
  # times. count() replaces the previous select/mutate/group_by/
  # summarise_all pipeline (summarise_all is superseded) and removes the
  # triplication across n-gram orders.
  freqOfFrequencies <- function(aDfm) {
    textstat_frequency(aDfm) %>%
      count(frequency, name = "freqOfFrequency")
  }
  NrTbl1 <- freqOfFrequencies(twitterDfm)
  SGT1 <- simpleGoodTuring(NrTbl1$frequency, NrTbl1$freqOfFrequency)
  NrTbl2 <- freqOfFrequencies(twitterDfm2Gram)
  SGT2 <- simpleGoodTuring(NrTbl2$frequency, NrTbl2$freqOfFrequency)
  NrTbl3 <- freqOfFrequencies(twitterDfm3Gram)
  SGT3 <- simpleGoodTuring(NrTbl3$frequency, NrTbl3$freqOfFrequency)
  c(
    # Smoothed probability of a unigram observed 'freq' times.
    dUnigram = function(freq) {
      SGT1$p[as.character(freq)]
    },
    # Discount ratio (smoothed count / raw count) for bigrams.
    dBigram = function(freq) {
      SGT2$r[as.character(freq)] / freq
    },
    # Discount ratio (smoothed count / raw count) for trigrams.
    dTrigram = function(freq) {
      SGT3$r[as.character(freq)] / freq
    }
  )
}
# Build the SGT discount closures once.
SGT <- sgtFactory()
# c() flattens the nested list names, so the model exposes
# predictModel$SGT.dUnigram / $SGT.dBigram / $SGT.dTrigram alongside the
# three n-gram dfms -- nextWords() depends on these exact names.
predictModel <- c(SGT = SGT,
trigram = twitterDfm3Gram,
bigram = twitterDfm2Gram,
unigram = twitterDfm)
# save(predictModel, file = "predictModel.rda")
nextWords <- function(input, predictModel, outputs = 3, k = 0) {
  # Predict the next word for 'input' with a Katz-style back-off chain:
  # trigram -> bigram -> unigram, discounting raw counts with the Simple
  # Good-Turing closures stored in 'predictModel'.
  #   input        : a character string of one or more words
  #   predictModel : flattened list holding SGT.d* closures plus the
  #                  trigram/bigram/unigram dfms
  #   outputs      : number of candidate words to return
  #   k            : back-off count threshold. k is the least important of
  #                  the parameters; it is usually chosen to be 0, but
  #                  empirical testing may find better values.
  # Returns a data frame of candidates with back-off probability 'p_bo'
  # (highest first), or NULL for empty input.
  inputs <- str_split(input, "\\s+")[[1]]
  inputsSize <- length(inputs)
  if (inputsSize > 1) {
    # The last two words form the trigram context "w1_w2".
    preTriGram <- paste(inputs[inputsSize - 1],
                        inputs[inputsSize],
                        sep = "_")
    # Trigrams starting with that context; glob "_*" matches any third word.
    nextWordDfm <- dfm_select(predictModel$trigram,
                              paste0(preTriGram, "_*"))
  } else {
    if (inputs == "") { return() }
    nextWordDfm <- NULL
  }
  preBiGram <- inputs[inputsSize]
  # extract n-gram that starts with input
  featuresNextWord <- NULL
  if (length(nextWordDfm) > k) {
    # Trigram hit: p_bo = d * count(w1_w2_w3) / count(w1_w2)
    prevWordDfm <- dfm_select(predictModel$bigram,
                              phrase(preTriGram))
    prevWordFreq <- textstat_frequency(prevWordDfm)$frequency
    # data frame
    featuresNextWord <-
      textstat_frequency(nextWordDfm) %>%
      mutate(p_bo = predictModel$SGT.dTrigram(frequency) * frequency / prevWordFreq)
    # human readable outputs: keep only the predicted (third) token
    featuresNextWord$feature <-
      sapply(as.vector(featuresNextWord$feature),
             function(x) {
               str_split(x, "_")[[1]][3]
             })
    # Sort by reverse frequency order
    featuresNextWord$feature <-
      with(featuresNextWord,
           reorder(feature,-p_bo))
  } else {
    # Back off to bigrams starting with the last input word.
    nextWordDfm <- dfm_select(predictModel$bigram,
                              paste0(preBiGram, "_*"))
    if (length(nextWordDfm) > k) {
      # Bigram hit: p_bo = d * count(w1_w2) / count(w1)
      prevWordDfm <- dfm_select(predictModel$unigram,
                                phrase(preBiGram))
      prevWordFreq <- textstat_frequency(prevWordDfm)$frequency
      # data frame
      featuresNextWord <-
        textstat_frequency(nextWordDfm) %>%
        mutate(p_bo = predictModel$SGT.dBigram(frequency) * frequency / prevWordFreq)
      # human readable outputs: keep only the predicted (second) token
      featuresNextWord$feature <-
        sapply(as.vector(featuresNextWord$feature),
               function(x) {
                 str_split(x, "_")[[1]][2]
               })
      # Sort by reverse frequency order
      featuresNextWord$feature <-
        with(featuresNextWord,
             reorder(feature,-p_bo))
    } else {
      # Unigram fallback. Bug fix: use the dfm stored in the model rather
      # than the global 'twitterDfm', so the function works with any model
      # object (e.g. one rebuilt from other corpora or loaded from disk).
      nextWordDfm <- predictModel$unigram
      featuresNextWord <-
        textstat_frequency(nextWordDfm) %>%
        mutate(p_bo = predictModel$SGT.dUnigram(frequency))
      # Sort by reverse frequency order
      featuresNextWord$feature <-
        with(featuresNextWord,
             reorder(feature,-p_bo))
    }
  }
  featuresNextWord %>% slice(1:outputs)
}
# load("predictModel.rda")
# Show the top-3 predicted next words for a sample phrase, with their
# back-off probabilities.
ggplot(nextWords("I went to be", predictModel), aes(x = feature, y = p_bo)) +
geom_bar(stat = "identity") +
xlab("Predicted next word") + ylab("P_bo")
# A single prediction takes ~3 seconds -- far too slow for a keyboard app.
system.time(nextWords("I went to be", predictModel))
## user system elapsed
## 2.813 0.152 2.997
The result of executing the implemented model is shown above. I have not formally verified the prediction accuracy, but the predictions look reasonable. However, predicting the next word currently takes several seconds, so the next step is to tune both the execution time and the prediction accuracy.